clear all 
set more off 
set maxvar 15000 
clear matrix

*-----------------------------*
* FIGURE D.4, PANEL A
*-----------------------------*

	use "$Mydirectory1/3_Output/2_PooledData_analysis.dta", clear 
    keep if baseline_sample==1
    
    rename wgt_sex_race weight
    
    gen non_log_father = exp(log_father_closest_census_v2) /* KEY */
     
    forval i=1(1)3 {
        gen est_`i'=. 
        gen est_lb_`i' =.
        gen est_ub_`i' =.
    }
    
    tempfile surveys 
    save `surveys'

*--------------------------------------*
*--------------------------------------*

*---------------------------------------*
/* BRING IN EARLY COHORT TSIV 
   RESULTS --- 1910-1930
   (DATA FROM NBER SERVER) */
*---------------------------------------*

use "$Mydirectory2/appendix_d/TSIV_results_1940_1936fix_levels.dta", clear

    keep if est_!=.

    tempfile early
    save `early'

*--------------------------------------*
*--------------------------------------*

*-----------------------------*
* TSIV RESULTS--- 1940-1970
*-----------------------------*

    local father_inc "non_log_father"
    
* Later decades: 1940-1970 

    foreach x1 in 4 5 6 7   {

        if "`x1'"=="4" local x2 "5" //use 1950 census here
        if "`x1'"=="5" local x2 "6"
        if "`x1'"=="6" local x2 "7"
        if "`x1'"=="7" local x2 "8"

    * Bring in corresponding Census 
        use "$CensusData/output/Census19`x2'0_fathers_ages30to50.dta", clear
        
    * Rename a couple variables
        if "`x1'"=="4" {
            clonevar HHinc = inctot
            rename slwt weight
        }
        else { 
            rename perwt weight
        }
        
    * Re-center weight to have mean 1
        quietly sum weight, d
        replace weight = weight/`r(mean)'
        quietly sum weight, d
        assert `r(mean)'==1
     
    * Append with survey data   
        append using `surveys' 
        replace census=0 if census==.
        
    * Restrict sample and set up triplets
        keep if census==1 | (census==0 & decade==19`x1'0) //keep census and relevant decade 
        drop if fatheroccej==99 //99 = non-working fathers
        
        local var_list "fatheroccej race south_merge" 
        egen triplet = group(`var_list')
        
        bysort census triplet: gen tag = _n==1 
        bysort triplet: egen number_samples = sum(tag) 
        tab number_samples census 
        keep if number_samples==2 //keep if triplet is in both census and surveys
        
    * Remake the dummies for the triplets that exist in both datasets 
        drop triplet* 
        egen triplet = group(`var_list')
        quietly tab triplet, gen(triplet_)
        local max = `r(r)'
        local max_minus1 = `r(r)'-1 //grabs # of dummies minus 1
        drop triplet_`max'

        
    * Save the dataset to be used for all regressions 
        tempfile restricted_data
        save `restricted_data'
        
    *------------------------------* 
    *------------------------------* 
   
    * 1. Baseline prediction using imputation 

        reg fam_inc_real `father_inc' if census==0  [aw=weight], robust 
        
            replace est_1 = _b[`father_inc'] 
            replace est_ub_1 = _b[`father_inc']+1.96*_se[`father_inc'] 
            replace est_lb_1 = _b[`father_inc']-1.96*_se[`father_inc'] 
            
    * 2. TSIV without any corrections --- naive approach 

        quietly reg HHinc triplet_* if  census==1 [aw=weight] 
        predict yhat if census==0 
            
        reg fam_inc_real yhat  if census==0 [aw=weight], robust 
            replace est_2 = _b[yhat] 
            replace est_ub_2 = _b[yhat]+1.96*_se[yhat] 
            replace est_lb_2 = _b[yhat]-1.96*_se[yhat] 
                            
        tempfile results
        save `results'
              
    /* 3. TSIV correcting standard errors based on two sample uncertainty 
          Source: (https://ars.els-cdn.com/content/image/1-s2.0-S0165176516302373-mmc1.pdf) */
        
        * First stage  
        keep if census==1
        gen const=1 
        gen x1 = HHinc
            
        //robust 
        quietly reg x1 triplet_* [aw=weight], robust
        mat Vx2het = e(V)
        
        //non-robust
        quietly sureg (x1 = triplet_*) [aw=weight]
        mat Vx2hom = e(V)
        
        * Second stage 
        use `restricted_data', clear
        keep if census==0
        gen y = fam_inc_real 
        
        predict x1h, equation(x1) //generate predicted x using Census results from sureg
        
        scalar kx = 1 //number of predicted variables
        scalar ke = 1 //number of exogenous variables (constant)
        
        quietly reg y triplet_* [aw=weight]
        mat Vy1hom = e(V)*e(df_r)/_N
        
        quietly reg y triplet_* [aw=weight], robust  
        mat Vy1het = e(V)*e(df_r)/_N
        
        /* TS2SLS estimator */
        reg y x1h [aw=weight]
        scalar coeff1 = _b[x1h]
        display coeff1
        mat b2s = e(b)
        mat b2sx = b2s[1,1..kx]
        
        /* Constructing C hat */
        quietly reg triplet_1 x1h [aw=weight]
        mat ch = e(b)'
        forval b=2(1)`max_minus1' {
        quietly reg triplet_`b' x1h [aw=weight]
        mat ch = ch,e(b)'   
        }
        mat ch = ch,(J(kx,ke,0)\I(ke))
        
        * Calculating non-robust standard errors 
        mat var1hom = ch*Vy1hom*ch' + (b2sx' # ch)*Vx2hom*(b2sx # ch')
        mat seb2shom = vecdiag(cholesky(diag(vecdiag(var1hom))))'
        
        * Calculating robust standard errors 
        mat var1het = ch*Vy1het*ch' + (b2sx' # ch)*Vx2het*(b2sx # ch')
        mat seb2shet = vecdiag(cholesky(diag(vecdiag(var1het))))'
        
        mat list seb2shom
        mat list seb2shet
        
        scalar se1=seb2shet[1,1]
        display se1
        
        scalar ub1 = coeff1 + (1.96*se1)
        scalar lb1 = coeff1 - (1.96*se1)
         
        
    * Bring back full dataset; save estimates 
        use `results', clear
        replace est_3 = coeff1 
        replace est_ub_3 = ub1 
        replace est_lb_3 = lb1 
                
    * Save
        bysort decade: keep if _n==1
        drop if decade==.
        keep est_* decade 
        
        tempfile results_`x1'
        save `results_`x1''

    } 
    
* Append results
    use `results_4'
    append using `results_5'
    append using `results_6'
    append using `results_7'
    
    reshape long est_ est_lb_ est_ub_, i(decade) j(estimate)
    replace decade= decade+1 if estimate==2
    replace decade= decade+2 if estimate==3
    
    append using `early'
    sort decade estimate 
        
    drop if decade<1920

* Figure
    #delimit ;
    twoway (scatter est_ decade if estimate==1,  msymbol(circle) mcolor(midblue) msize(medium) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if estimate==1, lpatter(solid) lcolor(midblue) yaxis(1) lwidth(0.5))
           (scatter est_ decade if estimate==2,  msymbol(square) mcolor(pink) msize(small) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if estimate==2, lpatter(solid) lcolor(pink) yaxis(1) lwidth(0.5) )
            (scatter est_ decade if estimate==3,  msymbol(diamond) mcolor(orange*0.7) msize(small) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if estimate==3, lpatter(solid) lcolor(orange*0.7) yaxis(1) lwidth(0.5) )
           ,
    xti(" " "Decade of respondent's birth") xlabel(1920(10)1970) xscale(range(1915 1975))
    ylabel(0(.25)1.5, axis(1)) yti("Coefficient" " ", axis(1)) scheme(s1color)
    legend(on ring(0) size(medsmall) row(3) pos(8) order(1 "Imputations" 3 "TS2SLS" 5 "TS2SLS, adj. std. errors"  ))
    xlabel(1920 "1920s" 1930 "1930s" 1940 "1940s" 1950 "1950s" 1960 "1960s" 1970 "1970s", labsize(small) ) ;  
    #delimit cr
    graph export "$Mydirectory2/appendix_d/IGE_imputation_vs_TSIV_blended_no1910_levels.pdf", as(pdf) replace
    